## Data Preparation Replication Script ##
## This file is going to replicate the final data that is used in the "figuresandtables_replicationscript.R"
## This script will pull in each individual yearly datasets (2018, 2019, and 2020), reformat them,
## combine them into a single dataframe, and then merge in the covariates to create the final data frame. 

#  Load necessary packages ----
# Clear the global environment so objects from an earlier session cannot leak
# into this run.
rm(list=ls())
library(data.table)
library(tools)
library(dplyr)
library(stargazer)
library(rdrobust)
library(xtable)
library(rddensity)
library(qdap)
library(readxl)
library(ggplot2)
library(gridExtra)

# Directory holding the replication CSVs read below. Defaults to the original
# hard-coded location; set the RDD_REPLICATION_DIR environment variable to run
# the script elsewhere without editing this file. An empty value is treated as
# "not set" so setwd() never receives "".
setwd(local({
  override <- Sys.getenv("RDD_REPLICATION_DIR", unset = "")
  if (nzchar(override)) override else "~/Dropbox (MIT)/RDD_stateleg/Replication Materials"
}))

# Read in 2018 data ----
cands <- data.table(read.csv("RDDcandidates2018_fullycleanedDec2021.csv"))

# Treat Democratic-Farmer-Labor candidates as Democrats.
cands[partyfull == "democraticfarmerlabor", partyfull := "democrat"]

# District number as text; for MN and MA the value is taken from `ddezo`.
cands[, dno := as.character(dno)]
cands[state == "MN", dno := ddezo]
cands[state == "MA", dno := toTitleCase(tolower(ddezo))]

# Two space-separated identifiers: `uniquedist` (one per district contest) and
# `uniqueprim` (the same string with the party appended).
cands[, uniquedist := paste(year, state, chamber, dname, dno, geopost, sep = " ")]
cands[, uniqueprim := paste(year, state, chamber, dname, dno, geopost, partyfull, sep = " ")]

# For ID and WA the last district component is `mmdpost` instead of `geopost`.
cands[state %in% c("ID", "WA"),
      uniquedist := paste(year, state, chamber, dname, dno, mmdpost, sep = " ")]
cands[state %in% c("ID", "WA"),
      uniqueprim := paste(year, state, chamber, dname, dno, mmdpost, partyfull, sep = " ")]

# Per-district maximum of `incumbent`, used to mark open-seat races
# (no incumbent from either party).
cands[, incumbentrunning := max(.SD$incumbent, na.rm = TRUE), by = uniquedist]

# pull in district counts:
districtsize <- data.table(read.csv("MMD_Data_BF19May2020.csv", stringsAsFactors = FALSE))
# Blank district names become NA so the pasted key matches the main file.
districtsize[dname == "", dname := NA]
districtsize[, year := rep(2018, .N)]
districtsize[, uniquedist := paste(year, state, chamber, dname, dno, geopost, sep = " ")]
# Look up each candidate's district in the size table and copy over `dseats`
# (NA where the district key has no match).
set(
  cands,
  j = "dseats",
  value = districtsize$dseats[match(cands$uniquedist, districtsize$uniquedist)]
)

# Flag lower-chamber rows in the listed states with mmd2 = 1; every other row
# keeps mmd2 = NA. (Presumably these are the two-seat districts, going by the
# variable name -- TODO confirm.)
cands[chamber=="Lower" & state %in% c("AZ", "NJ", "ND", "SD",  "WA", "VT"), mmd2 := 1 ] #flag those districts

#see if the 2nd- and 3rd-place finishers are of diff races
# Work on a separate subset (also called mmd2) of the flagged rows.
mmd2 <- cands[mmd2==1,]; dim(mmd2) 

mmd2[, primarycount:= .N, by=list(uniquedist, partyfull)] #how many in the primary?
contestedmmd <- mmd2[primarycount>2,] #drop primaries with only two people 
# NOTE(review): primaryvotecount is only passed through as.numeric() further
# down the script; if read.csv imported it as text this key would sort
# lexicographically rather than numerically -- confirm the column type.
setkey(contestedmmd, uniquedist, primaryvotecount) #sort by votecounts
# Rows are in ascending key order, so the last row of each primary gets
# voteorder 1 and the first row gets voteorder .N.
contestedmmd[, voteorder:= .N:1, by=list(uniquedist, partyfull)] 
contestedmmd_top3 <- contestedmmd[voteorder <=3,] #keep just the top three candidates 
# Wide reshape: one row per uniqueprim, with every other column suffixed by the
# candidate's voteorder (.1, .2, .3).
contestedmmd_reshape <- reshape(contestedmmd_top3, idvar = c("uniqueprim"), timevar = "voteorder", direction = "wide") #one row per race 
##see cases where cands 2/3 are of different races 
# diffraces starts at 1 and is set to 0 only where race.2 equals race.3.
contestedmmd_reshape[, diffraces:= 1]
contestedmmd_reshape[race.2==race.3, diffraces:= 0]
mmdstokeep <- contestedmmd_reshape[diffraces==1,] 

# mmd2keep: NA for unflagged rows, 0 for flagged rows by default, 1 when the
# row's district appears in mmdstokeep. The match is on the district id
# (uniquedist.1), not on the primary id.
cands[mmd2==1, mmd2keep:=0] #set up a flag
cands[mmd2==1 & uniquedist %in% mmdstokeep$uniquedist.1, mmd2keep :=1]#flag the mmd primary districts we actually want
# Keep unflagged rows (mmd2keep NA) plus the flagged rows marked 1.
cands <- cands[is.na(mmd2keep)==T | mmd2keep==1,]; dim(cands)

#drop the more-than-two-per-district places
# dropchamb is 1 for: all MD and NH rows, WV lower-chamber rows, and VT
# upper-chamber rows; 0 otherwise. `%in%` keeps the flag free of NA.
cands[, dropchamb := as.numeric(
  state %in% c("MD", "NH") |
    (chamber %in% "Lower" & state %in% c("WV")) |
    (chamber %in% "Upper" & state %in% c("VT"))
)]
cands <- cands[dropchamb != 1, ]; dim(cands)

# Remove the top-two primary states.
cands <- cands[!(state %in% c("CA", "LA", "NE", "WA")), ]

## start looking at close races
#for RDD-able primaries, how close was the win?
# NOTE(review): `rddprim <- cands` does not copy a data.table; under
# data.table's reference semantics the `:=` on the next line likely adds
# primarycount to cands as well. rddprim only becomes a separate object once it
# is re-assigned from a subset two lines below -- confirm this is intended.
rddprim <- cands
rddprim[, primarycount:= .N, by=list(uniquedist, partyfull)] #count how many candidates in each primary
rddprim <- rddprim[rddprim$primarycount>1,]; dim(rddprim) #drop if single-cand. primary
# Numeric copy of the vote count, used for all arithmetic below.
# NOTE(review): if primaryvotecount was imported as a factor, as.numeric()
# would return level codes rather than counts -- confirm the column type.
rddprim[,primaryvotes:= as.numeric(primaryvotecount)]
summary(rddprim$primaryvotes)

# fixmissingness is a plain data.frame (read.csv without data.table()), hence
# the base-R indexing in the lines that follow.
fixmissingness <- read.csv("candidatesmissing2018primaryvotes_sept2021_EJfilled.csv") #pull in manually-searched fixes (lots turned out to have withdrawn)
fixmissingness$primaryvotes <- as.numeric(fixmissingness$primaryvotecount)
# Blank out the votes for three specific primaries, identified by uniqueprim.
fixmissingness[fixmissingness$uniqueprim=="2018 AL Upper NA 7 NA democrat","primaryvotes"] <- NA #this one is a losing runoff cand & shouldn't have been coded.
fixmissingness[fixmissingness$uniqueprim=="2018 GA Lower NA 144 NA democrat","primaryvotes"] <- NA #this one is a losing runoff cand & shouldn't have been coded.
fixmissingness[fixmissingness$uniqueprim=="2018 AL Lower NA 54 NA democrat","primaryvotes"] <- NA #this one is a losing runoff cand & shouldn't have been coded.

# Keep an unfiltered copy before trimming to rows that carry a vote count.
fullmissingnessfixes <- fixmissingness 
fixmissingness <- fixmissingness[is.na(fixmissingness$primaryvotes)==F, ]; dim(fixmissingness) #keep only those with votes, to swap in values
summary(rddprim$primaryvotes)

# Overwrite primaryvotes for every candidate whose fullname appears in the fix
# file, taking the value from the first matching fix row.
# NOTE(review): the lookup key is fullname alone; two candidates sharing a name
# would receive the same value -- confirm names are unique.
rddprim[fullname %in% fixmissingness$fullname, primaryvotes:= fixmissingness$primaryvotes[match(fullname, fixmissingness$fullname)]]
summary(rddprim$primaryvotes)

## Vote shares and top-two margins within each party primary (handles >2 candidates)
# Each candidate's share of the non-missing votes cast in their primary.
rddprim[, primvoteshare := primaryvotes / sum(primaryvotes, na.rm = TRUE), by = list(uniquedist, partyfull)]
setkeyv(rddprim, c("uniquedist", "partyfull", "primvoteshare"))

# Rows are now ascending by share inside each primary, so counting down from .N
# gives the top vote-getter voteorder 1.
rddprim[, voteorder := rev(seq_len(.N)), by = list(uniquedist, partyfull)]
# Top two finishers only; rows with a non-missing mmd2 flag are excluded here.
rddprim_top2 <- rddprim[voteorder < 3 & is.na(mmd2), ]
# diffrace = 0 when the two remaining candidates share a race value, 1 otherwise.
rddprim_top2[, diffrace := ifelse(race[1] == race[2], 0, 1), by = list(uniquedist, partyfull)]
rddprim_top2 <- rddprim_top2[diffrace == 1, ]
# Second row minus first row: the higher share minus the lower share.
rddprim_top2[, primwinmargin := primvoteshare[2] - primvoteshare[1], by = list(uniquedist, partyfull)]
missingprimmargin <- rddprim_top2[is.na(primwinmargin), ]; dim(missingprimmargin)

# Drop primaries with no computable margin (they did not actually occur).
rddprim_top2 <- rddprim_top2[!is.na(primwinmargin), ]; dim(rddprim_top2)

# Flag primaries in which at least one of the two candidates has a blank race.
races <- rddprim_top2[race == "", ]
rddprim_top2[uniqueprim %in% races$uniqueprim, ]
rddprim_top2[, missingrace := 0]
rddprim_top2[uniqueprim %in% races$uniqueprim, missingrace := 1]

##for the RDD, we'll ultimately want an election-level dataset. 
#start by calculating the "white candidate win margin": how much the white candidate won/lost the primary by
# Within each primary: share of the row with race "White" minus the share of the
# row with any other race value.
# NOTE(review): when a primary has no "White" row (or two of them) the two
# operands differ in length; the later is.na(whiteprimarymargin) handling
# suggests such primaries are expected to end up NA -- confirm how data.table
# treats a zero-length RHS in a grouped `:=`.
rddprim_top2[, whiteprimarymargin:= .SD[race=="White"]$primvoteshare - .SD[!(race=="White")]$primvoteshare , by=list(uniquedist, partyfull)]
summary(rddprim_top2$whiteprimarymargin)

# Group flags: 1 when either candidate in the primary has the named race value.
rddprim_top2[, RDDLatino:=0]; rddprim_top2[, RDDBlack:=0] #create some flags that will let us subset by group
rddprim_top2[, RDDLatino:= ifelse("Latino" %in% .SD$race, 1, 0) , by=uniqueprim]
rddprim_top2[, RDDBlack:= ifelse("Black" %in% .SD$race, 1, 0) , by=uniqueprim]
rddprim_top2[, RDDAsian:= ifelse("Asian American" %in% .SD$race, 1, 0) , by=uniqueprim]
rddprim_top2[is.na(whiteprimarymargin)==T, RDDLatino:=NA] #set these to missing if there's no white candidate (since they will drop from analysis as well)
rddprim_top2[is.na(whiteprimarymargin)==T, RDDBlack:=NA] #set these to missing if there's no white candidate (since they will drop from analysis as well)
rddprim_top2[is.na(whiteprimarymargin)==T, RDDAsian:=NA]

#also, want to carry through whether the party nominates an incumbent or not, and a woman or not
# Both flags are initialised to 0 and then overwritten per primary: 1 when the
# row with incumbent==1 (resp. gender=="W") has voteorder 1.
# NOTE(review): for primaries with no matching row the filter yields a
# zero-length RHS; the `:= 0` initialisation appears to exist to cover that
# case -- confirm.
rddprim_top2[, incumbentwins:=0]
rddprim_top2[, incumbentwins:= ifelse(.SD[incumbent==1]$voteorder==1, 1, 0), by=list(uniquedist, partyfull)]
summary(rddprim_top2$incumbentwins)
rddprim_top2[, womanwins:=0]
rddprim_top2[, womanwins:= ifelse(.SD[gender=="W"]$voteorder==1, 1, 0), by=list(uniquedist, partyfull)]
summary(rddprim_top2$womanwins)
rddprim_top2[, womanwins:= max(womanwins), by=list(uniquedist, partyfull)] #make sure both obs from each prim get this
summary(rddprim_top2$womanwins)
# minorityincumbent: 1 when the non-"White" row has incumbent==1; the group max
# then copies the value onto both rows of the primary.
rddprim_top2[, minorityincumbent:= ifelse(.SD[!(race=="White")]$incumbent==1, 1, 0), by=list(uniquedist, partyfull)]
rddprim_top2[, minorityincumbent:= max(minorityincumbent), by=list(uniquedist, partyfull)] #make sure both obs from each prim get this

##check which party is the one with the relevant primary, and calc their share of the 2-party voteshare in the general
generals <- cands[!(is.na(generalvotecount)), ] #keep only the general election cands
setkey(generals, uniquedist, partyfull)

generalstrim <- subset(generals, select=c("uniquedist", "partyfull", "generalvotecount", "fullname", "mmd2keep", "generaloutcome")) #trim down, then reshape
generalstrim[, partycount:= .N, by=c("uniquedist", "partyfull")]
generalstrim <- generalstrim[is.na(mmd2keep),] #drop the MMDs for now (they are handled separately further down)
setorder(generalstrim, uniquedist, partyfull, -generalvotecount) #sort so the first obs is always the biggest vote-getter from party

# Wide reshape: one row per district, with one set of columns per party value
# (e.g. generalvotecount.democrat, generalvotecount.republican).
general <- reshape(generalstrim, idvar = c("uniquedist"), timevar = "partyfull", direction = "wide")
general[, dem2p := generalvotecount.democrat/(generalvotecount.democrat+generalvotecount.republican)] #calc 2party vote share
general[, rep2p := generalvotecount.republican/(generalvotecount.democrat+generalvotecount.republican)]
general[, reptotal := generalvotecount.republican]
general[, demtotal := generalvotecount.democrat]
general[, demwin:= ifelse(generaloutcome.democrat=="w", 1, 0)] #also pull through winning party
general[, repwin:= ifelse(generaloutcome.republican=="w", 1, 0)]

#set up an alternate measure where uncontested generals yield 100% vote share for winners, not NA
general[, dem2palt:= dem2p]
general[, rep2palt:= rep2p]
# A race is uncontested when one major party has no vote count and the other
# has a positive one. In that case dem2p/rep2p are necessarily NA (the missing
# count propagates through the division), so no separate NA test on the shares
# is needed. The two cases are mutually exclusive, so each is written in a
# single update.
general[is.na(generalvotecount.republican) & generalvotecount.democrat > 0,
        `:=`(uncontestedgen = 1, dem2palt = 1, rep2palt = 0)]
general[is.na(generalvotecount.democrat) & generalvotecount.republican > 0,
        `:=`(uncontestedgen = 1, dem2palt = 0, rep2palt = 1)]

# Attach the general-election outcomes to the top-two primary rows. This is an
# inner join on district, so primaries without a general-election row drop out.
# (The original script repeated these two statements verbatim; once is enough.)
generaltrim <- subset(general, select=c("uniquedist", "rep2p", "dem2p","rep2palt", "dem2palt", "repwin","demwin", "reptotal", "demtotal", "uncontestedgen"))
rddmerge <- merge(rddprim_top2, generaltrim, by=c("uniquedist"))

# 2018 Multi-member district primaries ----
## Have to handle them separately because their cutoffs/general vote counts are different.
rddprim_top3 <- rddprim[voteorder<4 & mmd2==1,] #keep just the top 3 for MMD cases

#label mismatched-race contests (between #2 and #3)
# rddprim is keyed ascending on primvoteshare within each primary, so in a
# three-row group row [1] is the 3rd-place finisher, row [2] the 2nd-place
# finisher and row [3] the top vote-getter. The contest at the cutoff is
# therefore rows [2] vs [1] -- the same pair used for primwinmargin below and
# in the 2019 section. (The previous [2] vs [3] comparison tested 2nd place
# against 1st place.) Primaries with fewer than three rows get NA and are
# dropped by the filter, as before; as.numeric() keeps the column type stable
# across groups even when a race value is NA.
rddprim_top3[, diffrace := if (.N >= 3L) as.numeric(ifelse(race[2] == race[1], 0, 1)) else NA_real_, by=list(uniquedist, partyfull)]
rddprim_top3 <- rddprim_top3[diffrace==1,] #keep only the ones with diff races
# 2nd-place share minus 3rd-place share (row order is ascending, see above).
rddprim_top3[, primwinmargin:= .SD$primvoteshare[2] - .SD$primvoteshare[1] , by=list(uniquedist, partyfull)]
summary(rddprim_top3$primwinmargin)

# White-candidate margin among the 2nd/3rd-place pair only (voteorder > 1).
rddprim_top3[, whiteprimarymargin:= .SD[(voteorder > 1 & race=="White")]$primvoteshare - .SD[(voteorder > 1) & !(race=="White")]$primvoteshare , by=list(uniquedist, partyfull)]

#also set up flags for incumbent/woman winning.
# Here "winning" means finishing in 2nd place (voteorder 2). Both flags start
# at 0 and are overwritten per primary.
rddprim_top3[, incumbentwins:=0]
rddprim_top3[, incumbentwins:= ifelse(.SD[incumbent==1]$voteorder==2, 1, 0), by=list(uniquedist, partyfull)]
summary(rddprim_top3$incumbentwins)
rddprim_top3[, womanwins:=0]
rddprim_top3[, womanwins:= ifelse(.SD[voteorder==2]$gender=="W", 1, 0), by=list(uniquedist, partyfull)]
summary(rddprim_top3$womanwins)
rddprim_top3[, womanwins:= max(womanwins), by=list(uniquedist, partyfull)] #make sure all obs from each prim get this
summary(rddprim_top3$womanwins)

## General-election results for the kept multi-member districts: two-party
## share is computed across the first two listed candidates of each party.
generalsmmd <- cands[mmd2keep == 1 & !is.na(generalvotecount), ]
setkeyv(generalsmmd, c("uniquedist", "partyfull"))
generalsmmdtrim <- subset(
  generalsmmd,
  select = c("uniquedist", "partyfull", "generalvotecount", "fullname", "generaloutcome")
)
# Count each party's candidates per district and number them in current row
# order; the number is appended to the party name to form the reshape suffix
# (e.g. "democrat1", "republican2").
generalsmmdtrim[, `:=`(partycount = .N, partycand = seq_len(.N)), by = c("uniquedist", "partyfull")]
generalsmmdtrim[, partycandnum := paste0(partyfull, partycand)]
generalmmd <- reshape(
  generalsmmdtrim,
  idvar = c("uniquedist"),
  timevar = c("partycandnum"),
  direction = "wide"
)

# Party totals and two-party shares; NA whenever any of the four counts is NA.
generalmmd[, c("dem2p", "demtotal", "rep2p", "reptotal") := {
  dem_votes <- generalvotecount.democrat1 + generalvotecount.democrat2
  rep_votes <- generalvotecount.republican1 + generalvotecount.republican2
  two_party <- generalvotecount.democrat1 + generalvotecount.democrat2 +
    generalvotecount.republican1 + generalvotecount.republican2
  list(dem_votes / two_party, dem_votes, rep_votes / two_party, rep_votes)
}]

# No "uncontested" alternative is defined for these districts.
generalmmd[, `:=`(dem2palt = NA, rep2palt = NA)]

generalmmdtrim <- subset(
  generalmmd,
  select = c("uniquedist", "rep2p", "dem2p", "rep2palt", "dem2palt", "reptotal", "demtotal")
)
rddmergeMMD <- merge(rddprim_top3, generalmmdtrim, by = "uniquedist")
dim(rddmergeMMD); dim(rddprim_top3)

# Stack onto the single-member results; columns missing on either side become NA.
rddmerge <- rbind(rddmerge, rddmergeMMD, fill = TRUE)

# Set up 2018 dataset for RDD analysis ----
# One row per primary (district x party); the first row of each is kept.
rdset2018 <- unique(rddmerge, by = c("uniquedist", "partyfull"))
dim(rdset2018)

## Pick the general-election outcomes belonging to the party whose primary this
## row describes: two-party share, raw total, the alternate share that keeps
## uncontested races, and the win indicator.
rdset2018[partyfull == "democrat", `:=`(
  general2p = dem2p,
  generaltotal = demtotal,
  general2palt = dem2palt,
  general2pwin = demwin
)]
rdset2018[partyfull == "republican", `:=`(
  general2p = rep2p,
  generaltotal = reptotal,
  general2palt = rep2palt,
  general2pwin = repwin
)]
# No win indicator when either party's two-party share is missing.
rdset2018[is.na(dem2p) | is.na(rep2p), general2pwin := NA]

# Shares of exactly 1 or exactly 0 are blanked out.
rdset2018[general2p == 1 | general2p == 0, general2p := NA]

# 2019 Data ----
## Same pipeline as 2018, applied to the three states with 2019 files.

# One candidate file per state; rows with an empty FullName are discarded.
MScands <- data.table(read.csv("MS_2019_RDDelig_Final.csv"))
NJcands <- data.table(read.csv("NJ_2019_RDDelig_Final.csv"))
VAcands <- data.table(read.csv("VA_2019_RDDelig_Final.csv"))
MScands <- MScands[FullName != "", ]
NJcands <- NJcands[FullName != "", ]
VAcands <- VAcands[FullName != "", ]

# State label and seats per district: 1 for MS and VA, 2 for NJ (used for the
# multi-member handling further down).
MScands[, state := rep("MS", .N)]
NJcands[, state := rep("NJ", .N)]
VAcands[, state := rep("VA", .N)]
VAcands[, seats.per.district := rep(1, .N)]
MScands[, seats.per.district := rep(1, .N)]
NJcands[, seats.per.district := rep(2, .N)]

# Build district/primary identifiers in the same layout as 2018,
# e.g. "2018 AL Lower NA 8 NA democrat".

# MS: chamber comes from the first letter of OfficeTitle ("S" = Upper, anything
# else = Lower); the district number is the first run of digits in it.
MScands[, distlabel := gsub("S", "Upper", gsub("H", "Lower", OfficeTitle))]
MScands[, chamber := ifelse(substr(OfficeTitle, 1, 1) == "S", "Upper", "Lower")]
MScands[, `:=`(geopost = NA, dname = NA)]  # placeholders; only other states use these
MScands[, dno := as.numeric(gsub(".*?([0-9]+).*", "\\1", OfficeTitle))]
MScands[, uniquedist := paste("2019", "MS", chamber, dname, dno, geopost, sep = " ")]
MScands[, uniqueprim := paste("2019", "MS", chamber, dname, dno, geopost, tolower(Party), sep = " ")]

# VA: "State Senate" is the upper chamber; party label normalised to "Democrat".
VAcands[, chamber := ifelse(OfficeTitle == "State Senate", "Upper", "Lower")]
VAcands[, `:=`(geopost = NA, dname = NA)]
VAcands[, dno := District]
VAcands[, Party := gsub("Democratic", "Democrat", Party)]
VAcands[, uniquedist := paste("2019", "VA", chamber, dname, District, geopost, sep = " ")]
VAcands[, uniqueprim := paste("2019", "VA", chamber, dname, District, geopost, tolower(Party), sep = " ")]

# NJ: "General Assembly" is the lower chamber; same party normalisation.
NJcands[, chamber := ifelse(OfficeTitle == "General Assembly", "Lower", "Upper")]
NJcands[, `:=`(geopost = NA, dname = NA)]
NJcands[, dno := District]
NJcands[, Party := gsub("Democratic", "Democrat", Party)]
NJcands[, uniquedist := paste("2019", "NJ", chamber, dname, dno, geopost, sep = " ")]
NJcands[, uniqueprim := paste("2019", "NJ", chamber, dname, dno, geopost, tolower(Party), sep = " ")]

#so for MS, where there's a runoff, replace the primary votes with the primary runoff votes. But if no runoff, just keep primary votes (by district-party)
# runoffval is the largest runoff vote count in the primary, or NA when no
# candidate in that primary has a runoff count. max(..., na.rm = TRUE) on an
# all-NA group returns -Inf (with a warning) rather than NA, which would make
# the is.na() test below report a runoff where none happened, so the all-NA
# case is handled explicitly. Indexing with [1L] yields an NA of the column's
# own type, keeping the result type identical across groups.
MScands[, runoffval := if (all(is.na(PrimaryRunoffVotes))) PrimaryRunoffVotes[1L] else max(PrimaryRunoffVotes, na.rm = TRUE), by=uniqueprim]
MScands[, runoff := ifelse(is.na(runoffval), 0,1)]
# In runoff primaries the runoff count replaces the first-round count for every
# candidate; candidates without a runoff count therefore become NA here.
MScands[runoff==1, PrimaryVotes := PrimaryRunoffVotes]

NJcands$runoff <- VAcands$runoff <- 0 #no runoffs here
# Harmonise NJ's differently-capitalised vote columns with the other states.
NJcands$PrimaryVotes <- NJcands$Primaryvotes
NJcands$GeneralVotes <- NJcands$Generalvotes

# Common `incumbent` column (note the NJ source column starts with a lowercase i).
NJcands$incumbent <- NJcands$incumbent..1.0.; MScands$incumbent<- MScands$Incumbent..1.0.; VAcands$incumbent <- VAcands$Incumbent..1.0.
# Stack the three states on a shared set of columns; from here on `cands` holds
# the 2019 candidates.
colstokeep <- c("Party", "FullName", "state", "dno","Race", "Gender", "incumbent","PrimaryVotes", "GeneralVotes", "chamber", "uniquedist", "uniqueprim", "runoff", "seats.per.district")
cands19 <- rbind(MScands[, ..colstokeep], NJcands[, ..colstokeep], VAcands[, ..colstokeep]); dim(cands19)
cands <- cands19

#mark open-seat races (no incumbent from either party)
# Per-district maximum of `incumbent`.
cands[, incumbentrunning:=max(.SD$incumbent, na.rm=T), by=uniquedist]

##see if the 2nd- and 3rd-place finishers are of diff races
# Work on the subset of rows with two seats per district.
mmd2 <- cands[seats.per.district==2,]; dim(mmd2) 
mmd2[, primarycount:= .N, by=list(uniquedist, Party)] #how many in the primary?
contestedmmd <- mmd2[primarycount>2,] #drop primaries with only two people
setkey(contestedmmd, uniquedist, PrimaryVotes) #sort by votecounts
# Ascending key order, so the last row of each primary gets voteorder 1.
contestedmmd[, voteorder:= .N:1, by=list(uniquedist, Party)] 
contestedmmd_top3 <- contestedmmd[voteorder <=3,] #keep just the top three candidates
# Wide reshape: one row per uniqueprim, other columns suffixed .1/.2/.3 by voteorder.
contestedmmd_reshape <- reshape(contestedmmd_top3, idvar = c("uniqueprim"), timevar = "voteorder", direction = "wide") #one row per race

# diffraces starts at 1 and is set to 0 only where Race.2 equals Race.3.
contestedmmd_reshape[, diffraces:= 1]
contestedmmd_reshape[Race.2==Race.3, diffraces:= 0]
mmdstokeep <- contestedmmd_reshape[diffraces==1,]
##go back into the candidates dataset and flag which ones we'll keep and which we won't. 
# mmd2keep: NA for single-seat rows, 0 for two-seat rows by default, 1 when the
# row's district (uniquedist) appears in mmdstokeep.
cands[seats.per.district==2, mmd2keep:=0] #set up a flag
cands[seats.per.district==2 & uniquedist %in% mmdstokeep$uniquedist.1, mmd2keep :=1]#flag the mmd primaries we actually want
cands <- cands[is.na(mmd2keep)==T | mmd2keep==1,]; dim(cands)

##still go ahead and drop the more-than-two-per-district places
cands <- cands[!(seats.per.district >2),]; dim(cands) #not actually relevant in 2019

# NOTE(review): `rddprim <- cands` does not copy a data.table; the `:=` on the
# next line likely adds primarycount to cands too (data.table reference
# semantics) -- confirm this is intended.
rddprim <- cands # we'll trim down directly to ensure contested primaries across race
rddprim[, primarycount:= .N, by=list(uniquedist, Party)] #count how many candidates in each primary
rddprim <- rddprim[primarycount>1,];dim(rddprim)
summary(as.numeric(rddprim$PrimaryVotes))

# Drop candidates with no primary vote count.
rddprim <- rddprim[!(is.na(PrimaryVotes)),]; dim(rddprim)

## calculate how many points the top vote-getter outstripped the next one by (allow for >2 candidates)
rddprim[, primvoteshare := PrimaryVotes / sum(PrimaryVotes, na.rm=T), by=list(uniquedist, Party)] #calculate everyone's voteshare
# Key on share so rows are ascending within each primary; voteorder then counts
# down so the top vote-getter is 1.
setkey(rddprim, uniquedist, Party, primvoteshare)

rddprim[, voteorder:= .N:1, by=list(uniquedist, Party)] #label in order of most votes won
# Top two finishers of every primary. Unlike the 2018 code, no multi-member
# filter is applied on this line; rows with a non-missing mmd2keep are removed
# later, after the merge with the general-election results.
rddprim_top2 <- rddprim[voteorder<3 ,] #keep just the top 2 (MMD rows are filtered out after the merge below)

# diffrace = 0 when the two candidates share a Race value, or when either Race
# value is blank; 1 otherwise.
rddprim_top2[, diffrace:=ifelse(.SD$Race[1]==.SD$Race[2], 0, 1), by=list(uniquedist, Party)]
rddprim_top2[, diffrace:=ifelse((.SD$Race[1]=="" | .SD$Race[2]==""), 0, diffrace), by=list(uniquedist, Party)]

rddprim_top2 <- rddprim_top2[diffrace==1,] #keep only the ones where top-2 candidates' race fields don't match
# Second row minus first row: the higher share minus the lower share.
rddprim_top2[, primwinmargin:= .SD$primvoteshare[2] - .SD$primvoteshare[1] , by=list(uniquedist, Party)]
summary(rddprim_top2$primwinmargin) 

rddprim_top2[, missingrace:=0] #add in a flag to line up w/2018 data (but no blank cases here)

##for the RDD, we'll ultimately want an election-level dataset. 
#calculate the "white candidate win margin": how much the white candidate won/lost the primary by
# Within each primary: share of the "White" row minus the share of the other row.
rddprim_top2[, whiteprimarymargin:= .SD[Race=="White"]$primvoteshare - .SD[!(Race=="White")]$primvoteshare , by=list(uniquedist, Party)]
summary(rddprim_top2$whiteprimarymargin)

# Group flags: 1 when either candidate in the primary has the named Race value.
rddprim_top2[, RDDLatino:=0]; rddprim_top2[, RDDBlack:=0] #create some flags that will let us subset by group
rddprim_top2[, RDDLatino:= ifelse("Latino" %in% .SD$Race, 1, 0) , by=uniqueprim]
table(rddprim_top2$RDDLatino)
rddprim_top2[, RDDBlack:= ifelse("Black" %in% .SD$Race, 1, 0) , by=uniqueprim]
table(rddprim_top2$RDDBlack)
rddprim_top2[, RDDAsian:= ifelse("Asian" %in% .SD$Race, 1, 0) , by=uniqueprim]

rddprim_top2[is.na(whiteprimarymargin)==T, RDDLatino:=NA] #set these to missing if there's no white candidate (since they will drop from analysis as well)
rddprim_top2[is.na(whiteprimarymargin)==T, RDDBlack:=NA] #set these to missing if there's no white candidate (since they will drop from analysis as well)
rddprim_top2[is.na(whiteprimarymargin)==T, RDDAsian:=NA] 
table(rddprim_top2$RDDBlack, exclude=NULL)
table(rddprim_top2$RDDLatino, exclude=NULL) 
table(rddprim_top2$RDDAsian, exclude=NULL) 

#also, want to carry through whether the party nominates an incumbent or not, a woman or not
# Both flags start at 0 and are overwritten per primary: 1 when the row with
# incumbent==1 (resp. Gender=="Female") has voteorder 1.
rddprim_top2[, incumbentwins:=0]
rddprim_top2[, incumbentwins:= ifelse(.SD[incumbent==1]$voteorder==1, 1, 0), by=list(uniquedist, Party)]
summary(rddprim_top2$incumbentwins)
rddprim_top2[, womanwins:=0]
rddprim_top2[, womanwins:= ifelse(.SD[Gender=="Female"]$voteorder==1, 1, 0), by=list(uniquedist, Party)]
summary(rddprim_top2$womanwins)
rddprim_top2[, womanwins:= max(womanwins), by=list(uniquedist, Party)] #make sure both obs from each prim get this
summary(rddprim_top2$womanwins)
#also carry through whether the minority candidate is incumbent
# The comparison must use "White" with a capital W, matching every other Race
# test in this section. With the lowercase "white" used previously the filter
# matched every row, so the flag recorded whether *either* candidate was an
# incumbent rather than the non-White one.
rddprim_top2[, minorityincumbent:= ifelse(.SD[!(Race=="White")]$incumbent==1, 1, 0), by=list(uniquedist, Party)]
rddprim_top2[, minorityincumbent:= max(minorityincumbent), by=list(uniquedist, Party)] #make sure both obs from each prim get this

## General-election side: find each party's result in the district and compute
## its share of the two-party vote.
generals <- cands[!is.na(GeneralVotes), ]  # candidates with a general-election count

bpgeneral <- generals  # same data carried forward under the name used below

setkeyv(bpgeneral, c("uniquedist", "Party", "GeneralVotes"))
bpgeneral[, partycount := .N, by = c("uniquedist", "Party")]

generalstrim <- subset(
  bpgeneral,
  select = c("uniquedist", "Party", "GeneralVotes", "FullName", "partycount")
)

# Largest vote-getter first within each district-party, so the reshape below
# picks that candidate up.
setorderv(generalstrim, c("uniquedist", "Party", "GeneralVotes"), order = c(1L, 1L, -1L))

general <- reshape(generalstrim, idvar = c("uniquedist"), timevar = "Party", direction = "wide")
general[, `:=`(
  dem2p = GeneralVotes.Democrat / (GeneralVotes.Democrat + GeneralVotes.Republican),
  rep2p = GeneralVotes.Republican / (GeneralVotes.Democrat + GeneralVotes.Republican),
  reptotal = GeneralVotes.Republican,
  demtotal = GeneralVotes.Democrat
)]
# No outcome/status variable in this file, so the winner is read off the shares.
general[, `:=`(
  demwin = ifelse(dem2p > .5, 1, 0),
  repwin = ifelse(rep2p > .5, 1, 0)
)]

# Alternate shares: an uncontested general counts as 100% for the party that
# ran (and 0% for the absent one) instead of NA.
general[, `:=`(dem2palt = dem2p, rep2palt = rep2p)]
general[(is.na(dem2palt) | is.na(rep2palt)) & is.na(GeneralVotes.Republican) & GeneralVotes.Democrat > 0,
        `:=`(uncontestedgen = 1, dem2palt = 1, rep2palt = 0)]
general[(is.na(dem2palt) | is.na(rep2palt)) & is.na(GeneralVotes.Democrat) & GeneralVotes.Republican > 0,
        `:=`(uncontestedgen = 1, dem2palt = 0, rep2palt = 1)]

generaltrim <- subset(
  general,
  select = c("uniquedist", "rep2p", "dem2p", "rep2palt", "dem2palt", "repwin", "demwin",
             "reptotal", "demtotal", "partycount.Democrat", "partycount.Republican", "uncontestedgen")
)

# Diagnostic left join first: its row count shows which primaries lack a general.
rddmerge <- merge(rddprim_top2, generaltrim, by = c("uniquedist"), all.x = TRUE)
dim(rddmerge); dim(rddprim_top2)

# The inner join is the one actually carried forward.
rddmerge <- merge(rddprim_top2, generaltrim, by = c("uniquedist"))
dim(rddmerge); dim(rddprim_top2)

# Multi-member rows (non-missing mmd2keep) are removed here and handled separately.
rddmerge <- rddmerge[is.na(mmd2keep), ]

# 2019 Multi-member district primaries ----
### handle separately because their cutoffs/general vote counts are different. 
rddprim_top3 <- rddprim[voteorder<4 & mmd2keep==1,] #keep just the top 3 for MMD cases
##now go in and label mismatched-race contests (between #2 and #3)
# rddprim was keyed ascending on primvoteshare within each primary, so in a
# three-row group row [1] is voteorder 3 and row [2] is voteorder 2; comparing
# [2] with [1] is therefore the 2nd-vs-3rd-place comparison.
# NOTE(review): in a primary with only two rows, [1] and [2] would be voteorder
# 2 and 1 instead -- confirm such primaries cannot reach this point.
rddprim_top3[, diffrace:=ifelse(.SD$Race[2]==.SD$Race[1], 0, 1), by=list(uniquedist, Party)]
rddprim_top3 <- rddprim_top3[diffrace==1,] #keep only the ones with diff races
rddprim_top3[, primwinmargin:= .SD$primvoteshare[2] - .SD$primvoteshare[1] , by=list(uniquedist, Party)] #(this compares cand 2 to cand 3--sort order is reversed)
summary(rddprim_top3$primwinmargin) 

# White-candidate margin among the rows with voteorder > 1 only.
rddprim_top3[, whiteprimarymargin:= .SD[(voteorder > 1 & Race=="White")]$primvoteshare - .SD[(voteorder > 1) & !(Race=="White")]$primvoteshare , by=list(uniquedist, Party)]
summary(rddprim_top3$whiteprimarymargin)

#also, want to carry through whether the party nominates an incumbent or not, a woman or not
# Both flags describe the voteorder-2 candidate: 1 when that row has
# incumbent==1 (resp. Gender=="Female"), 0 otherwise.
rddprim_top3[, incumbentwins:=0]
rddprim_top3[, incumbentwins:= ifelse(.SD[voteorder==2]$incumbent==1, 1, 0), by=list(uniquedist, Party)]
summary(rddprim_top3$incumbentwins)
rddprim_top3[, womanwins:=0]
rddprim_top3[, womanwins:= ifelse(.SD[voteorder==2]$Gender=="Female", 1, 0), by=list(uniquedist, Party)]
summary(rddprim_top3$womanwins)
rddprim_top3[, womanwins:= max(womanwins), by=list(uniquedist, Party)] #make sure both obs from each prim get this
summary(rddprim_top3$womanwins)

### General-election results for the multi-member primaries kept above; the
### two-party share is taken across the first two listed candidates per party.
unique(rddprim_top3$uniquedist)
generalstrim_mmd <- generalstrim[uniquedist %in% rddprim_top3$uniquedist, ]; dim(generalstrim_mmd)

# Number each party's candidates within the district (in current row order) and
# append that number to the party name to form the reshape suffix.
generalstrim_mmd[, partycand := seq_len(.N), by = c("uniquedist", "Party")]
generalstrim_mmd[, partycandnum := paste0(Party, partycand)]

generalmmd <- reshape(generalstrim_mmd, idvar = c("uniquedist"), timevar = "partycandnum", direction = "wide")
# Two-party shares and party totals; NA whenever any of the four counts is NA.
generalmmd[, c("dem2p", "rep2p", "demtotal", "reptotal") := {
  dem_votes <- GeneralVotes.Democrat1 + GeneralVotes.Democrat2
  rep_votes <- GeneralVotes.Republican1 + GeneralVotes.Republican2
  two_party <- GeneralVotes.Democrat1 + GeneralVotes.Democrat2 +
    GeneralVotes.Republican1 + GeneralVotes.Republican2
  list(dem_votes / two_party, rep_votes / two_party, dem_votes, rep_votes)
}]

# The alternate shares simply mirror the regular ones here.
generalmmd[, dem2palt := dem2p]
generalmmd[, rep2palt := rep2p]
generalmmdtrim <- subset(
  generalmmd,
  select = c("uniquedist", "rep2p", "dem2p", "rep2palt", "dem2palt", "reptotal", "demtotal")
)

rddmergeMMD <- merge(rddprim_top3, generalmmdtrim, by = "uniquedist")
dim(rddmergeMMD); dim(rddprim_top3)

## Stack onto the single-member results; columns missing on either side become NA.
rddmerge <- rbind(rddmerge, rddmergeMMD, fill = TRUE)

# Set up 2019 dataset for RDD analysis ----
# One row per primary (district x party); the first row of each is kept.
rdset2019 <- unique(rddmerge, by = c("uniquedist", "Party"))
dim(rdset2019)
summary(rdset2019$whiteprimarymargin)

## Pick the general-election outcomes belonging to the party whose primary this
## row describes: two-party share, the alternate share that keeps uncontested
## races, and the win indicator.
rdset2019[Party == "Democrat", `:=`(
  general2p = dem2p,
  general2palt = dem2palt,
  general2pwin = demwin
)]
rdset2019[Party == "Republican", `:=`(
  general2p = rep2p,
  general2palt = rep2palt,
  general2pwin = repwin
)]
# No win indicator when either party's two-party share is missing.
rdset2019[is.na(dem2p) | is.na(rep2p), general2pwin := NA]
summary(rdset2019$general2p); summary(rdset2019$general2pwin)
summary(rdset2019$general2palt)

# Raw general-election vote total for the primary's party.
rdset2019[Party == "Democrat", generaltotal := demtotal]
rdset2019[Party == "Republican", generaltotal := reptotal]

# 2020 Data ----
### Okay, now the same setup process (with some adjustments due to file format) for 2020 data

#read in coded/cleaned 2020 data
cands <- data.table(read.csv("RDDcandidates2020_fullycleanedNov2021.csv"))

#also, mark open-seat races (no incumbent from either party)
# incumbentrunning = largest `incumbent` value among all candidates in the district.
# NOTE(review): max(..., na.rm=T) returns -Inf (with a warning) for a district whose `incumbent` values are all NA -- confirm none exist
cands[, incumbentrunning:=max(.SD$incumbent, na.rm=T), by=uniquedist]

# see if the 2nd- and 3rd-place finishers are of diff races
# (in a two-seat district the 2nd/3rd-place pair is the one compared)
mmd2 <- cands[seats.per.district==2,]; dim(mmd2)
mmd2[, primarycount:= .N, by=list(uniquedist, party2)] #how many in the primary?
contestedmmd <- mmd2[primarycount>2,] #drop primaries with only two people (keeps primaries with three or more candidates)
setkey(contestedmmd, uniquedist, bpprimaryvotes) #sort by votecounts (ascending within district)
# rows are in ascending vote order, so counting down from .N gives voteorder 1 to the top vote-getter of each party primary
contestedmmd[, voteorder:= .N:1, by=list(uniquedist, party2)] 
contestedmmd_top3 <- contestedmmd[voteorder <=3,] #keep just the top three candidates
contestedmmd_reshape <- reshape(contestedmmd_top3, idvar = c("uniqueprim"), timevar = "voteorder", direction = "wide") #one row per race
###so now each row is one race and the column names (.1, .2, .3) tell us the person's vote order
###see cases where cands 2/3 are of different races 
# default to "different"; only an explicit match between the 2nd and 3rd finishers' canrace resets the flag
# NOTE(review): if either canrace is NA the comparison is NA and the primary keeps diffraces == 1 -- confirm that is intended
contestedmmd_reshape[, diffraces:= 1]
contestedmmd_reshape[canrace.2==canrace.3, diffraces:= 0]
mmdstokeep <- contestedmmd_reshape[diffraces==1,]
cands[seats.per.district==2, mmd2keep:=0] #set up a flag (stays NA for districts that are not two-seat)
cands[seats.per.district==2 & uniqueprim %in% mmdstokeep$uniqueprim, mmd2keep :=1]#flag the mmd primaries we actually want
# keep everything outside two-seat districts (flag is NA) plus the flagged two-seat primaries
cands <- cands[is.na(mmd2keep)==T | mmd2keep==1,]; dim(cands)

##drop the more-than-two-per-district places
# NOTE(review): rows with NA seats.per.district are presumably dropped here too (data.table treats NA in a logical i as FALSE) -- confirm
cands <- cands[!(seats.per.district >2),]; dim(cands)

cands <- cands[!(state %in% c("CA", "LA", "NE", "WA")),] #for now, just dropping the top-two primary states
dim(cands)

# NOTE: `rddprim <- cands` does not copy a data.table, so the `:=` on the next line also adds
# primarycount to `cands`; rddprim only becomes a separate object at the row filter below.
rddprim <- cands #ignore that variable: we'll trim down directly to ensure contested primaries across race
rddprim[, primarycount:= .N, by=list(uniquedist, party2)] #count how many candidates in each primary
table(rddprim$primarycount) #hmm, lots of many-candidate races. 
rddprim <- rddprim[primarycount>1,];dim(rddprim) # contested primaries only

rddprim <- rddprim[!(is.na(bpprimaryvotes)),]; dim(rddprim) # drop candidates without a primary vote count

## calculate how many points the top vote-getter outstripped the next one by? (allow for >2 candidates)
# shares are computed over the candidates still present after the two filters above
rddprim[, primvoteshare := bpprimaryvotes / sum(bpprimaryvotes, na.rm=T), by=list(uniquedist, party2)] #calculate everyone's voteshare
# sort ascending by vote share within each primary; the positional [1]/[2] indexing below depends on this order
setkey(rddprim, uniquedist, party2, primvoteshare)

rddprim[, voteorder:= .N:1, by=list(uniquedist, party2)] #label in order of most votes won
# NOTE: this filter is on voteorder only, so rows from flagged MMD primaries are still included at this point
rddprim_top2 <- rddprim[voteorder<3 ,] #keep just the top 2, and set the MMD cases aside to handle separately
#now go in and label mismatched-race contests
# diffrace = 1 when the two remaining candidates' canrace values differ (NA if either canrace, or the second row, is missing)
rddprim_top2[, diffrace:=ifelse(.SD$canrace[1]==.SD$canrace[2], 0, 1), by=list(uniquedist, party2)]

# a blank canrace on either candidate does not count as a cross-race contest
rddprim_top2[, diffrace:=ifelse((.SD$canrace[1]=="" | .SD$canrace[2]==""), 0, diffrace), by=list(uniquedist, party2)]
table(rddprim_top2$diffrace) 
# likewise for canrace coded "unknown"
rddprim_top2[, diffrace:=ifelse((.SD$canrace[1]=="unknown" | .SD$canrace[2]=="unknown"), 0, diffrace), by=list(uniquedist, party2)]
table(rddprim_top2$diffrace) 

rddprim_top2 <- rddprim_top2[diffrace==1,] #keep only the ones where top-2 candidates' race fields don't match
# rows are ascending by share, so element [2] is the primary winner and [1] the runner-up: margin = winner - runner-up
rddprim_top2[, primwinmargin:= .SD$primvoteshare[2] - .SD$primvoteshare[1] , by=list(uniquedist, party2)]
summary(rddprim_top2$primwinmargin) 

rddprim_top2[, missingrace:=0] #add in a flag to line up w/2018 data (but no blank cases here)

## The RDD ultimately needs an election-level dataset.
# "White candidate win margin": the white candidate's primary vote share minus
# the non-white candidate's share. A primary without a white candidate yields
# an empty result and is left as NA.
rddprim_top2[
  ,
  whiteprimarymargin := primvoteshare[which(canrace == "white")] -
    primvoteshare[which(canrace != "white")],
  by = list(uniquedist, party2)
]
summary(rddprim_top2$whiteprimarymargin)

# Group flags for subsetting: 1 if either of the two candidates in the primary
# has that canrace value, 0 otherwise.
rddprim_top2[, RDDLatino := 0]
rddprim_top2[, RDDBlack := 0]
rddprim_top2[, RDDLatino := as.numeric("latino" %in% canrace), by = uniqueprim]
table(rddprim_top2$RDDLatino)
rddprim_top2[, RDDBlack := as.numeric("black" %in% canrace), by = uniqueprim]
table(rddprim_top2$RDDBlack)
rddprim_top2[, RDDAsian := as.numeric("asian" %in% canrace), by = uniqueprim]

# Primaries with no white candidate drop from the analysis anyway, so blank
# their group flags as well.
rddprim_top2[
  is.na(whiteprimarymargin),
  c("RDDLatino", "RDDBlack", "RDDAsian") := NA
]
table(rddprim_top2$RDDBlack, exclude = NULL)
table(rddprim_top2$RDDLatino, exclude = NULL)
table(rddprim_top2$RDDAsian, exclude = NULL)

#also, want to carry through whether the party nominates an incumbent or not, a woman or not
# start at 0, then overwrite per primary with 1/0 depending on whether the incumbent finished first (voteorder == 1).
# NOTE(review): in primaries with no incumbent the right-hand side is zero-length; presumably data.table then leaves
# the pre-set 0 in place -- confirm. Unlike womanwins there is no max() pass afterwards.
rddprim_top2[, incumbentwins:=0]
rddprim_top2[, incumbentwins:= ifelse(.SD[incumbent==1]$voteorder==1, 1, 0), by=list(uniquedist, party2)]
summary(rddprim_top2$incumbentwins)
# same pattern for gender: 1 when a candidate with gender == "Woman" finished first
rddprim_top2[, womanwins:=0]
rddprim_top2[, womanwins:= ifelse(.SD[gender=="Woman"]$voteorder==1, 1, 0), by=list(uniquedist, party2)]
summary(rddprim_top2$womanwins)
# when both candidates are women the step above writes a different value on each row; max() makes the primary's rows agree
rddprim_top2[, womanwins:= max(womanwins), by=list(uniquedist, party2)] #make sure both obs from each prim get this
summary(rddprim_top2$womanwins)
#also carry through whether the minority candidate is incumbent
# 1/0 according to the incumbent flag of the candidate(s) whose canrace is not "white" (this column is not pre-initialised)
rddprim_top2[, minorityincumbent:= ifelse(.SD[!(canrace=="white")]$incumbent==1, 1, 0), by=list(uniquedist, party2)]
rddprim_top2[, minorityincumbent:= max(minorityincumbent), by=list(uniquedist, party2)] #make sure both obs from each prim get this

## Next: work out which party held the relevant primary and compute that
## party's share of the two-party vote in the general election.
# candidates that have a general-election vote count
generals <- cands[!is.na(votesfor)]

# Ballotpedia results: keep the general-election stage only.
ballotpedia <- data.table(read.csv("Ballotpedia data for Paru Shah (2020 State Legislative Candidates and votes) (1).csv"))
bpgeneral <- ballotpedia[Stage == "General"]
dim(bpgeneral)

# 0/1 indicators from the party label; pattern matching (rather than equality)
# also catches multi-party endorsements.
bpgeneral[, democrat := as.numeric(grepl("Democratic", Party.name))]
sum(bpgeneral$democrat)
bpgeneral[, republican := as.numeric(grepl("Republican", Party.name))]
sum(bpgeneral$republican)

# Three-value party code (same basic setup as the main file): DEM or GOP only
# when exactly one of the two indicators is set. Candidates matching neither
# label, or endorsed by both parties, stay "OTH".
bpgeneral[, partyclean := "OTH"]
bpgeneral[democrat == 1 & republican == 0, partyclean := "DEM"]
bpgeneral[republican == 1 & democrat == 0, partyclean := "GOP"]

#pull in the hand-corrected district names file 
distcorrections <- read.csv("districtsneedinggeneralresultsfromBP_july2021_EJfilled.csv")
#I want to use this to correct the "uniquedist" column in the generaltrim object before concatenating/merging 
# lookup key in the "2020 <state> <upperchamber> <seat>" layout (paste's default single-space separator)
distcorrections$newuniquedist <-  paste("2020", distcorrections$state, distcorrections$upperchamber, trimws(distcorrections$Seat))

#need to set up district name to try to have them merge to the main file. 
# strip the literal "District " prefix from the Ballotpedia seat label
bpgeneral[, district:= trimws(gsub("District ", "", Seat)) ]
# chamber indicator: 1 unless Chamber is exactly "Lower"
bpgeneral[, upperchamber:=1]; bpgeneral[Chamber=="Lower", upperchamber:=0]; table(bpgeneral$upperchamber)
bpgeneral[, uniquedist:= paste("2020", State, upperchamber, trimws(district))]

dim(bpgeneral[uniquedist %in% distcorrections$newuniquedist,]) # how many Ballotpedia rows have a hand correction available
# for those rows, replace the Ballotpedia-derived key with the hand-corrected `uniquedist` from the corrections file
bpgeneral[uniquedist %in% distcorrections$newuniquedist, uniquedist:= trimws(distcorrections$uniquedist[match(uniquedist, distcorrections$newuniquedist)])] #swap in corrections where available

bpgeneral <- bpgeneral[is.na(Votes.for)==F,]; dim(bpgeneral) #drop the obs with no vote totals

setkey(bpgeneral, uniquedist, partyclean, Votes.for)
bpgeneral[, partycount:= .N, by=c("uniquedist", "partyclean")] #look for weird duplicates (multiple of the same party in general)

generalstrim <- subset(bpgeneral, select=c("uniquedist", "partyclean", "Votes.for", "Candidate.name", "Candidate.status", "partycount")) #trim down, then reshape

setorder(generalstrim, uniquedist, partyclean, -Votes.for) #sort so the first obs is always the biggest vote-getter from party (assume genuine nominee)

# wide layout: one row per district with .DEM / .GOP / .OTH suffixed columns.
# NOTE(review): when a party has several candidates in a district, reshape() is expected to keep the first row
# (the top vote-getter after the sort above) -- confirm against the reshape documentation
general <- reshape(generalstrim, idvar = c("uniquedist"), timevar = "partyclean", direction = "wide") #one row per race
general[, dem2p := Votes.for.DEM/(Votes.for.DEM+Votes.for.GOP)] #calc 2party vote share where possible
general[, rep2p := Votes.for.GOP/(Votes.for.DEM+Votes.for.GOP)] # both shares are NA when either party has no vote count
general[, reptotal := Votes.for.GOP]
general[, demtotal := Votes.for.DEM]
general[, demwin:= ifelse(Candidate.status.DEM=="Won", 1, 0)] #also pull through winning party
general[, repwin:= ifelse(Candidate.status.GOP=="Won", 1, 0)]
#alternate version: fix it so that uncontested generals yield 100% vote share for winners, not NA
general[, dem2palt:= dem2p]
general[, rep2palt:= rep2p]
# flag generals where one major party has votes and the other has no vote count at all (flag is 1, otherwise NA)
general[(is.na(dem2palt)|is.na(rep2palt))& is.na(Votes.for.GOP) & Votes.for.DEM>0 , uncontestedgen := 1] 
general[(is.na(dem2palt)|is.na(rep2palt)) & is.na(Votes.for.DEM) & Votes.for.GOP>0 , uncontestedgen := 1]
# Democrat-only generals: dem share 1, rep share 0. Each condition is re-evaluated after the previous assignment;
# it still selects the same rows because the other alt column is still NA at that point.
general[(is.na(dem2palt)|is.na(rep2palt)) & is.na(Votes.for.GOP) & Votes.for.DEM>0 , dem2palt := 1] 
general[(is.na(dem2palt)|is.na(rep2palt)) & is.na(Votes.for.GOP) & Votes.for.DEM>0 , rep2palt := 0] 
# Republican-only generals: dem share 0, rep share 1 (same re-evaluation logic)
general[(is.na(dem2palt)|is.na(rep2palt)) & is.na(Votes.for.DEM) & Votes.for.GOP>0 , dem2palt := 0] 
general[(is.na(dem2palt)|is.na(rep2palt)) & is.na(Votes.for.DEM) & Votes.for.GOP>0 , rep2palt := 1] 
summary(general$dem2palt); summary(general$rep2palt) 

generaltrim <- subset(general, select=c("uniquedist", "rep2p", "dem2p", "rep2palt", "dem2palt", "repwin", "demwin", "reptotal", "demtotal", "partycount.DEM", "partycount.GOP", "uncontestedgen"))

# merge() without all= keeps only primaries whose district key is also present in the general results
rddmerge <- merge(rddprim_top2, generaltrim, by=c("uniquedist")) #now run it with only the ones that match
dim(rddmerge); dim(rddprim_top2) 

# mmd2keep is NA for every row outside a two-seat district, so this keeps only those rows
rddmerge <- rddmerge[is.na(rddmerge$mmd2keep)==T,] #drop the MMDs since we'll handle them separately

# 2020 Multi-member district primaries ----
### handle them separately because their cutoffs/general vote counts are different. 
# rddprim is still sorted ascending by vote share within each primary, so within a group the rows
# appear as 3rd place, 2nd place, 1st place
rddprim_top3 <- rddprim[voteorder<4 & mmd2keep==1,] #keep just the top 3 for MMD cases
##now go in and label mismatched-race contests (between #2 and #3)
# NOTE(review): this comparison reads the `race` column, whereas the rest of the 2020 code (the MMD screening,
# the top-two diffrace step, and whiteprimarymargin below) uses `canrace` -- confirm `race` is really intended here
rddprim_top3[, diffrace:=ifelse(.SD$race[2]==.SD$race[1], 0, 1), by=list(uniquedist, party2)]
rddprim_top3 <- rddprim_top3[diffrace==1,] #keep only the ones with diff races
rddprim_top3[, primwinmargin:= .SD$primvoteshare[2] - .SD$primvoteshare[1] , by=list(uniquedist, party2)] #(this compares cand 2 to cand 3--sort order is reversed)
summary(rddprim_top3$primwinmargin) 

# white-minus-non-white share among the 2nd/3rd-place finishers only (voteorder > 1 excludes the top vote-getter)
rddprim_top3[, whiteprimarymargin:= .SD[(voteorder > 1 & canrace=="white")]$primvoteshare - .SD[(voteorder > 1) & !(canrace=="white")]$primvoteshare , by=list(uniquedist, party2)]
summary(rddprim_top3$whiteprimarymargin)

#also, want to carry through whether the party nominates an incumbent or not, a woman or not
# in the MMD setup these flags describe the 2nd-place finisher (voteorder == 2), not the top vote-getter
rddprim_top3[, incumbentwins:=0]
rddprim_top3[, incumbentwins:= ifelse(.SD[voteorder==2]$incumbent==1, 1, 0), by=list(uniquedist, party2)]
summary(rddprim_top3$incumbentwins)
rddprim_top3[, womanwins:=0]
rddprim_top3[, womanwins:= ifelse(.SD[voteorder==2]$gender=="Woman", 1, 0), by=list(uniquedist, party2)]
summary(rddprim_top3$womanwins)
rddprim_top3[, womanwins:= max(womanwins), by=list(uniquedist, party2)] #make sure both obs from each prim get this
summary(rddprim_top3$womanwins)

###then we want to check which party is the one with the relevant primary, and calc their share of the 2-party voteshare in the general (this time across top 2 cands from each party)
generalstrim_mmd <- generalstrim[generalstrim$uniquedist %in% rddprim_top3$uniquedist, ]; dim(generalstrim_mmd)

# numbering follows the current row order of generalstrim, which was sorted by descending votes within
# district and party, so candidate 1 is the party's top vote-getter (the numbering is not random)
generalstrim_mmd[, partycand:= 1:.N, by=c("uniquedist", "partyclean")] #label cands 1/2 from each party, in row order
generalstrim_mmd[, partycandnum:= paste(partyclean, partycand, sep="")] # e.g. "DEM1", "GOP2"

generalmmd <- reshape(generalstrim_mmd, idvar = c("uniquedist"), timevar = "partycandnum", direction = "wide") #one row per race
# shares sum both candidates of each party; an NA in any of the four vote columns makes the share NA
generalmmd[, dem2p := (Votes.for.DEM1 + Votes.for.DEM2) /(Votes.for.DEM1 + Votes.for.DEM2+Votes.for.GOP1 + Votes.for.GOP2)] #calc 2party vote share where possible
generalmmd[, rep2p := (Votes.for.GOP1 + Votes.for.GOP2) /(Votes.for.DEM1 + Votes.for.DEM2+Votes.for.GOP1 + Votes.for.GOP2)]
generalmmd[, demtotal := (Votes.for.DEM1 + Votes.for.DEM2)]
generalmmd[, reptotal := (Votes.for.GOP1 + Votes.for.GOP2)]

# the "alt" columns are plain copies for MMDs (no uncontested-race adjustment is applied here)
generalmmd[, dem2palt:= dem2p]; generalmmd[, rep2palt:= rep2p]
generalmmdtrim <- subset(generalmmd, select=c("uniquedist", "rep2p", "dem2p", "rep2palt", "dem2palt", "reptotal", "demtotal"))

# merge() without all= keeps only MMD primaries whose district also appears in the general results
rddmergeMMD <- merge(rddprim_top3, generalmmdtrim, by=c("uniquedist"))
dim(rddmergeMMD); dim(rddprim_top3)

##combine with the main rdd dataset, and go back to analysis. 
# fill=TRUE pads columns missing from the MMD table (e.g. repwin, demwin, uncontestedgen) with NA
rddmerge <- rbind(rddmerge, rddmergeMMD, fill=TRUE)

# Set up 2020 dataset for RDD analysis ----
# One row per district-by-party primary; all rows of a primary already share
# the election-level fields.
rdset2020 <- unique(rddmerge, by = c("uniquedist", "party2"))
dim(rdset2020)

# Manual open-seat fix: an incumbent was missed in this district because he was
# appointed after the primary winner dropped out.
# https://ballotpedia.org/North_Carolina_House_of_Representatives_District_53
rdset2020[uniquedist == "2020 NC 0 53", incumbentrunning := 1]

## Pick the general-election columns of the party whose primary the row
## describes; rows with any other party2 value stay NA.
##   general2p    - two-party vote share
##   general2palt - alternate share that does not drop uncontested generals
##   general2pwin - binary general win/loss
##   generaltotal - total general-election votes
outcome_by_party <- list(
  general2p    = c(DEM = "dem2p",    GOP = "rep2p"),
  general2palt = c(DEM = "dem2palt", GOP = "rep2palt"),
  general2pwin = c(DEM = "demwin",   GOP = "repwin"),
  generaltotal = c(DEM = "demtotal", GOP = "reptotal")
)
for (outcome_col in names(outcome_by_party)) {
  for (party_code in names(outcome_by_party[[outcome_col]])) {
    party_col <- outcome_by_party[[outcome_col]][[party_code]]
    rdset2020[party2 == party_code, (outcome_col) := get(party_col)]
  }
}

# win/loss is NA if the other major party didn't contest (no two-party share)
rdset2020[is.na(dem2p) | is.na(rep2p), general2pwin := NA]

# Shares of exactly 1 or 0 are the handful of uncontested-general cases from
# the MMDs; set them to NA on general2p, consistent with all other districts.
rdset2020[general2p %in% c(0, 1), general2p := NA]

# Create full dataset ----
# Combine 2018, 2019, and 2020 datasets and merge in covars 
# The 2018 and 2019 district keys are rebuilt as "<year> <state> <upperchamber> <dno>"
# so that they line up with the 2020 version of uniquedist.
rdset2018$upperchamber <-ifelse(rdset2018$chamber=="Upper", 1, 0)
rdset2018$uniquedist <- paste(rdset2018$year, rdset2018$state, rdset2018$upperchamber, rdset2018$dno, sep=" ") #create one district index

rdset2020$party <- ifelse(rdset2020$party2=="DEM", "Democratic", ifelse(rdset2020$party2=="GOP", "Republican", "Other")) #set up party column differently to line up with 2018 data. 

rdset2019$party <- ifelse(rdset2019$Party=="Democrat", "Democratic", "Republican")
rdset2019$year <- 2019
rdset2019$upperchamber <- ifelse(rdset2019$chamber=="Upper", 1,0)
#manually recode this handful of NJ district numbers from words to integers
# dno holds district labels as text (e.g. "Fifteenth"), so the replacements are written as
# strings; this avoids relying on an implicit numeric-to-character coercion in `:=`
rdset2019[dno=="Fifteenth", dno:= "15"]
rdset2019[dno=="Thirty-Sixth", dno:= "36"]
rdset2019[dno=="Twenty-Ninth", dno:= "29"]
rdset2019[dno=="Twenty-Second", dno:= "22"]

rdset2019$uniquedist <- paste(rdset2019$year, rdset2019$state, rdset2019$upperchamber, rdset2019$dno, sep=" ") #create one district index; rebuild this column to align with 2020 version

# Only columns present in all three yearly datasets are carried into the combined file.
columnoverlap <- intersect(colnames(rdset2018), colnames(rdset2020)) #check which columns overlap
columnoverlap #note we will want to be careful with what we use here: the 2020 data does have "race" but it's not actually the column we generally use
columnoverlap2 <- intersect(columnoverlap, colnames(rdset2019)) #now one more layer as we add in 2019
rddsetcombined <- rbind(subset(rdset2020, select = columnoverlap2), subset(rdset2019, select = columnoverlap2), subset(rdset2018, select = columnoverlap2))
# the combined row count should equal the sum of the three yearly row counts
dim(rdset2020); dim(rdset2018); dim(rdset2019); dim(rddsetcombined)
table(rddsetcombined$year)
rddset <- rddsetcombined 
rddset <- rddset[party %in% c("Democratic", "Republican"),] #drop one random third-party primary row.
dim(rddset)

## District-level covariates, one file per chamber (SLDL = lower, SLDU = upper).
lowercovars <- read.csv("SLDL_Demographics_ACS1418.csv")
uppercovars <- read.csv("SLDU_Demographics_ACS1418.csv")

# Updated key of district names.
lowernames <- read.csv("SLDL_Key.csv")
uppernames <- read.csv("SLDU_Key.csv")

# MA upper-chamber codes: a numeric round trip swaps out the 0's in the names.
ma_upper_rows <- uppernames$State_Abr == "MA"
uppernames[ma_upper_rows, "SLDU"] <- as.character(as.numeric(uppernames[ma_upper_rows, "SLDU"]))

# Two district labels per key row, built as "<state> <chamber flag> <district>"
# like the main file: one from the district code (used for the merge) and an
# alternate one from BASENAME.
lowernames$uniquedist_noyear <- with(lowernames, paste(State_Abr, 0, SLDL))
lowernames$uniquedist_noyear_replace <- with(lowernames, paste(State_Abr, 0, BASENAME))
uppernames$uniquedist_noyear <- with(uppernames, paste(State_Abr, 1, SLDU))
uppernames$uniquedist_noyear_replace <- with(uppernames, paste(State_Abr, 1, BASENAME))

## CVAP estimates (from ACS 2015-2019); only the "Total" rows are kept.
lowercvap <- read.csv("SLDLC.csv")
uppercvap <- read.csv("SLDUC.csv")
lowercvap <- lowercvap[lowercvap$lntitle == "Total", ]
dim(lowercvap)
uppercvap <- uppercvap[uppercvap$lntitle == "Total", ]
dim(uppercvap)

# Cut a piece out of each geoid, counted from its end: characters from
# (length - from_end) through (length - to_end).
geoid_piece <- function(geoid, from_end, to_end) {
  geoid_len <- nchar(geoid)
  substr(geoid, geoid_len - from_end, geoid_len - to_end)
}

# Set the CVAP files up like the main covariate files: the last three geoid
# characters are the district code, the two before them the numeric state FIPS.
lowercvap$SLDL <- geoid_piece(lowercvap$geoid, 2, 0)
uppercvap$SLDU <- geoid_piece(uppercvap$geoid, 2, 0)
lowercvap$State_FIPS <- as.numeric(geoid_piece(lowercvap$geoid, 4, 3))
uppercvap$State_FIPS <- as.numeric(geoid_piece(uppercvap$geoid, 4, 3))

lowercovarsCVAP <- merge(lowercovars, lowercvap, by = c("SLDL", "State_FIPS"))
dim(lowercovars)
dim(lowercovarsCVAP)
uppercovarsCVAP <- merge(uppercovars, uppercvap, by = c("SLDU", "State_FIPS"))
dim(uppercovars)
dim(uppercovarsCVAP)

# From here on the covariate tables are the fuller, CVAP-augmented versions.
lowercovars <- lowercovarsCVAP
uppercovars <- uppercovarsCVAP

#set up district names as in the main dataset
# state names outside the 50 in state.name get an NA abbreviation here and are dropped at the end of this block
lowercovars$state <- state.abb[match(lowercovars$State,state.name)] #convert to abbrevs
uppercovars$state <- state.abb[match(uppercovars$State,state.name)] #convert to abbrevs
lowercovars$Upper <- 0; uppercovars$Upper <- 1

# district label: the numeric form of the code where it parses as a number (so leading zeros disappear),
# otherwise the original string code
lowercovars$distname <- as.numeric(lowercovars$SLDL)
lowercovars[is.na(lowercovars$distname), "distname"] <- lowercovars[is.na(lowercovars$distname),]$SLDL #fill in string name if it's not numeric
uppercovars$distname <- as.numeric(uppercovars$SLDU)
uppercovars[is.na(uppercovars$distname), "distname"] <- uppercovars[is.na(uppercovars$distname),]$SLDU #fill in string name if it's not numeric 

lowercovars$uniquedist_noyear <- paste(lowercovars$state, 0, lowercovars$distname, sep=" ") #create one district index
uppercovars$uniquedist_noyear <- paste(uppercovars$state, 1, uppercovars$distname, sep=" ") #create one district index
# strip the leading "<year> " (first five characters) from the main dataset's district key
rddset$uniquedist_noyear <- substr(rddset$uniquedist, 6, nchar(rddset$uniquedist))
# stack the two chambers after removing the chamber-specific code column.
# A logical mask is used instead of -which(...): with -which(), a missing column name would
# give an empty index and silently drop every column.
covars <- rbind(lowercovars[, !(names(lowercovars) %in% "SLDL")], uppercovars[, !(names(uppercovars) %in% "SLDU")])
dim(covars); dim(lowercovars); dim(uppercovars)
covars <- covars[!is.na(covars$state),]; dim(covars) #drop PR observations (rows whose state name is not in state.name)

rddset$rddval <- 1 # marks rows that belong to the RD set (used after the all.y merges below)
# first attempt: merge() without all= keeps only rows whose district key matches a covariate row
rddset_covars <- merge(rddset, covars, by="uniquedist_noyear"); dim(rddset_covars); dim(rddset)
# RD rows whose district key found no covariate match
probs <- rddset[!(rddset$uniquedist_noyear %in% rddset_covars$uniquedist_noyear),]; dim(probs)

##now try correcting those problem obs 
allnames <- rbind(subset(lowernames, select=c("uniquedist_noyear", "uniquedist_noyear_replace")),subset(uppernames, select=c("uniquedist_noyear", "uniquedist_noyear_replace"))); dim(allnames); dim(lowernames); dim(uppernames)

covars <- data.table(covars)
# for unmatched rows, look the current key up among the alternate labels and swap in the code-based label.
# NOTE: when no alternate label matches, match() returns NA and the key becomes NA
rddset[uniquedist_noyear %in% probs$uniquedist_noyear, uniquedist_noyear:= allnames$uniquedist_noyear[match(uniquedist_noyear, allnames$uniquedist_noyear_replace)]] #swap in new district names only in cases where there's a problem, then re-merge. 
#and a couple more manual fixes here, especially for 2018 data:
# these select rows through uniqueprim / uniquedist, which the key repair above does not modify
rddset[uniqueprim =="2018 VT Lower windham 4 NA democrat", uniquedist_noyear:= "VT 0 W-4"]
rddset[uniquedist =="2020 MN 0 4A", uniquedist_noyear:= "MN 0 04A"]
rddset[uniquedist=="2020 ID 0 10A", uniquedist_noyear:="ID 0 10"]
rddset[uniquedist=="2020 ID 0 19B", uniquedist_noyear:="ID 0 19"]
rddset[uniquedist=="2020 ID 0 20A", uniquedist_noyear:="ID 0 20"]
# second attempt with the repaired keys
rddset_covars <- merge(rddset, covars, by="uniquedist_noyear"); dim(rddset_covars); dim(rddset)

#also, merge it the other direction and keep all the legislative districts for some comparisons.
# all.y=T keeps every covariate district; districts with no RD row for that year get NA in the rddset columns
covars_rddset18 <- merge(rddset[rddset$year==2018,], covars, by="uniquedist_noyear", all.y=T); dim(covars_rddset18); dim(covars)
covars_rddset20 <- merge(rddset[rddset$year==2020,], covars, by="uniquedist_noyear", all.y=T); dim(covars_rddset20); dim(covars)

covars_rddset19 <- merge(rddset[rddset$year==2019,], covars, by="uniquedist_noyear", all.y=T); dim(covars_rddset19); dim(covars)

# rddval: 1 for districts in that year's RD set, 0 for all other districts
covars_rddset18[is.na(covars_rddset18$rddval)==T, "rddval"] <- 0
covars_rddset19[is.na(covars_rddset19$rddval)==T, "rddval"] <- 0
covars_rddset20[is.na(covars_rddset20$rddval)==T, "rddval"] <- 0
summary(covars_rddset18$rddval)

#now set the covars version as the main dataset
# (this is the matched-rows-only merge, so RD rows without a covariate match are not carried forward)
rddset <- rddset_covars

# merge in the district vote data BF put together
votedata <- read.csv("2016_PresBySLDist.csv") 

#pull in separate TX/FL files to fix missingness
TXpres <- read.csv("Texas_2016.csv")
dim(TXpres)
dim(votedata[votedata$state=="Texas",]) #check sizes: yes, we were missing a lot in TX
#format this like the main file and swap it in for all of the TX results
TXpres$state <- "Texas"
TXpres$state_fips <- 48
TXpres$Dem_2pyShare <- TXpres$Dem / (TXpres$Dem + TXpres$Rep) # Democratic share of the two-party vote

# same treatment for Florida
FLpres <- read.csv("Florida_2016.csv")
dim(FLpres); dim(votedata[votedata$state=="Florida",])
FLpres$state <- "Florida"
FLpres$state_fips <- 12
FLpres$Dem_2pyShare <- FLpres$Dem / (FLpres$Dem + FLpres$Rep)

#drop TX and FL from the main vote data and rbind() these new versions in
votedata <- votedata[!(votedata$state %in% c("Florida", "Texas")),]; dim(votedata)
# NOTE(review): rbind() on data frames needs all three tables to carry the same column names -- presumably
# the TX/FL files already contain the remaining columns of the main file; confirm
votedata <- rbind(votedata, FLpres, TXpres); dim(votedata)

#make some naming changes so they'll line up with the main dataset
library(textclean) # attached here, just before replace_ordinal() is used
# rewrite the Massachusetts district labels in the vote data with replace_ordinal(), then upper-case them
votedata[votedata$state=="Massachusetts", "district"] <- toupper(replace_ordinal(votedata[votedata$state=="Massachusetts", "district"]))
# rebuild the MA keys in the main dataset from uniquedist: strip the "<year> " prefix, remove hyphens, spell out "&", upper-case
rddset[state.y=="MA", uniquedist_noyear:= toupper(gsub("&", "and", gsub("-", "", substr(uniquedist, 6,nchar(uniquedist)))))] #and change main dataset to align as well

#now want to set up unique district labels that will match the main dataset for simpler merge
# chamber flag: 0 for "State House", 1 for every other office value
votedata$Upper <- ifelse(votedata$office=="State House", 0, 1); table(votedata$Upper); table(votedata$office)
votedata$uniquedist_noyear <- paste(votedata$state_postal, votedata$Upper, votedata$district, sep=" ") 

# matched-only merge, kept as a separate object; it is used later to flag which vote-data districts are in the RD set
rddset_votes <- merge(rddset, votedata, by="uniquedist_noyear"); dim(rddset_votes); dim(rddset)

#now redo the merge to the vote data, keeping all the unmatched as well, so we can keep using the same dataset for everything
dim(rddset) 
# all.x=T keeps every RD row; compare the dim() output before and after to check for unexpected row changes
rddset <- merge(rddset, votedata, by="uniquedist_noyear", all.x=T); dim(rddset)
summary(rddset$Dem_2pyShare)

# Last set of fixes: presidential results for districts that are still
# missing them, merged in by geoname.
missingpres <- read.csv("MissingPres_2016.csv")
missingpres$Dem_2pyShare_fill <- with(missingpres, Dem / (Dem + Rep))

rddset2 <- merge(
  rddset,
  missingpres[, c("geoname", "Dem_2pyShare_fill")],
  by = "geoname",
  all.x = TRUE
)
dim(rddset2)
# only rows that still lack a share take the fill value
rddset2[is.na(Dem_2pyShare), Dem_2pyShare := Dem_2pyShare_fill]
summary(rddset2$Dem_2pyShare)

rddset <- rddset2

# The opposite direction: an indicator in the main vote dataset of which
# districts are in our RD set (1 if the district key appears in the
# matched-only merge, 0 otherwise).
votedata$rddset <- as.numeric(
  votedata$uniquedist_noyear %in% rddset_votes$uniquedist_noyear
)
table(votedata$rddset)
dim(rddset_votes)
length(unique(rddset_votes$uniquedist_noyear))
length(unique(rddset$uniquedist_noyear))

# Reverse the running variable: minority win margin instead of white
# candidate win margin.
rddset$minorityprimarymargin <- -rddset$whiteprimarymargin

# Clean final dataset ----
# Variables needed for the final analysis, in output column order. The .x / .y
# suffixes come from the earlier merges.
final_columns <- c(
  "geoid", "geoname", "distname", "district", "State_FIPS", "state.y", "year.x",
  "general2p", "general2palt", "minorityprimarymargin", "incumbentrunning",
  "incumbentwins", "womanwins", "party", "Dem_2pyShare", "Total.x",
  "uniquedist_noyear", "uniquedist", "pctWhite", "pctBlack", "RDDBlack",
  "pctLatino", "RDDLatino", "pctAsian", "RDDAsian", "uncontestedgen",
  "Upper.x", "mmd2keep"
)
rddset <- rddset[, final_columns, with = FALSE]

# Save the final dataset ----
dim(rddset)
write.csv(rddset, "fullRDDdataset20182020_analysis.csv")

